#coding:utf-8

import re
import time
import json
import random
import requests
#import IP_Pool as ip
#from bs4 import BeautifulSoup

'''
function: 获取html网页信息
return: 返回得到的html
'''
def getHTML(url):
    """Fetch a page with browser-like headers and return its body as text.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded response body (``r.text``). The HTTP status code is not
        checked, so error pages are returned as-is. An empty string is
        returned when the request itself fails (connection error, timeout,
        invalid URL, ...).
    """
    # Browser-like headers so the request is less likely to be flagged as a crawler.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3',
        'Referer': 'https://www.jd.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers',
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
    except requests.RequestException:
        # Catch only errors raised by requests itself, so that Ctrl-C
        # (KeyboardInterrupt) and programming errors are not silently
        # turned into "empty page".
        print("爬取网页失败辣")
        return ""
    return r.text


'''
function: 用于获得查询商品的ID和callback参数
return: 返回采集的商品ID列表和callback列表
'''
def getProductID(html):
    """Collect the product IDs found in *html* together with their callback values.

    Every ``data-sku="..."`` attribute in *html* is treated as a product ID.
    Duplicates are dropped (the first occurrence wins, so page order is kept)
    and ``getCallback`` is called once per remaining ID. IDs for which
    ``getCallback`` returns ``'null'`` are left out of the result.

    Args:
        html: Markup of a search result page.

    Returns:
        A tuple ``(productIDs, callbacks)`` of two equally long lists, where
        ``callbacks[k]`` is the callback value of ``productIDs[k]``.
    """
    # The capture group yields the attribute value directly; dict.fromkeys
    # removes duplicates while keeping first-seen order (a set would not).
    skus = dict.fromkeys(re.findall(r'data-sku="(.*?)"', html))

    newProductID = []
    callBack = []
    for sku in skus:
        tmp = getCallback(sku)
        if tmp == 'null':
            # Some products expose no callback value; skip them so both
            # lists stay aligned.
            continue
        newProductID.append(sku)
        callBack.append(tmp)
    return newProductID, callBack


'''
function: 获得商品callback参数
return: 返回callback参数
'''
def getCallback(productID):
    """Read the ``commentVersion`` value from a product's detail page.

    The page ``https://item.jd.com/<productID>.html`` is fetched with
    ``getHTML`` and searched for the first ``commentVersion...,`` snippet.

    Args:
        productID: Product ID; converted with ``str`` when building the URL.

    Returns:
        The text between the first and second colon of the snippet with its
        first character and last two characters removed (for a snippet such
        as ``commentVersion:'123',`` that is ``123``), or the string
        ``'null'`` when the page contains no usable snippet.
    """
    url = 'https://item.jd.com/' + str(productID) + '.html'
    html = getHTML(url)

    match = re.search(r"commentVersion.*?,", html)
    if match is None:
        # Some products have no callback value (also the case when the
        # page could not be fetched and html is empty).
        return 'null'
    try:
        return match.group(0).split(':')[1][1:-2]
    except IndexError:
        # Snippet without a colon: nothing to extract.
        return 'null'


def _parseJsonp(text):
    """Decode the JSON document wrapped in a JSONP response such as ``cb({...});``.

    Raises:
        ValueError: if *text* has no ``(...)`` wrapper or the wrapped payload
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end < start:
        raise ValueError('response is not JSONP')
    return json.loads(text[start + 1:end])


def collectInfo():
    """Interactively scrape JD.com product comments into a text file.

    Prompts for a search keyword and a rating type (``3`` good, ``2`` medium,
    ``1`` bad). Product IDs are taken from one search result page via
    ``getHTML``/``getProductID``; then comment pages 0-79 are requested for
    every product. Each comment whose length is greater than 8 and less than
    100 characters is printed and appended, one per line, to ``好评.txt``,
    ``中评.txt`` or ``差评.txt`` depending on the rating type.

    A failed or unparsable comment page is reported on stdout and skipped.
    If the rating type is not ``1``, ``2`` or ``3`` a message is printed and
    nothing is scraped.

    Returns:
        None.
    """
    from urllib.parse import quote  # local import: only needed here

    keyword = input('请输入商品关键字:')
    commType = input('采集评价类型<好评:3,中评:2,差评:1>:').strip()

    # Validate once up front instead of failing on every single comment.
    txtType = {'1': '差评', '2': '中评', '3': '好评'}
    if commType not in txtType:
        print('评价类型只能是 1、2 或 3')
        return
    filename = txtType[commType] + '.txt'

    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    url = 'https://search.jd.com/Search?keyword=' + quote(keyword) + '&enc=utf-8&psort=3&page=2'
    html = getHTML(url)
    productID, callBack = getProductID(html)

    for i in range(80):
        # NOTE: proxy support (IP_Pool) is currently disabled.
        for pid, cb in zip(productID, callBack):
            url = ('https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv'
                   + str(cb) + '&productId=' + pid + '&score=' + str(commType) + '&sortType=5&page='
                   + str(i) + '&pageSize=10&isShadowSku=0&fold=1')
            try:
                # Same timeout as getHTML so a stalled connection cannot hang the run.
                jsonHtml = requests.get(url, timeout=30)
            except requests.RequestException:
                print('数据没拿过来！')
                continue
            if not jsonHtml.text:
                print('数据没拿过来！')
                continue

            try:
                data = _parseJsonp(jsonHtml.text)
                for m in data['comments']:
                    content = m['content']
                    if 8 < len(content) < 100:
                        print("评论内容:", content)
                        # Explicit UTF-8 so emoji and rare characters do not
                        # fail on locales with a narrower default encoding;
                        # 'with' closes the handle after every write.
                        with open(filename, 'a', encoding='utf-8') as out:
                            out.write(content + '\n')
                    # Pause after every comment (kept from the original
                    # pacing) to go easy on JD's servers.
                    time.sleep(0.6)
            except (ValueError, KeyError, TypeError, OSError):
                # Malformed payload, unexpected JSON shape or a failed file
                # write: report it and move on to the next page/product.
                print('\n-----------------出现一个小错-----------------\n')



# Lists of product names (presumably search keywords -- TODO confirm). Neither
# is used by the code in this file: the triple-quoted block is a bare string
# expression and the shorter list below it is commented out.
'''productName = ['数码相机','耳机','手表','U盘','手办','手机配件','海贼王','插线板','火影忍者','单反','','充电宝',\
               '平板电脑','学习机','笔记本电脑','内存条','音箱','鼠标','SSD硬盘','游戏机','手环','游戏手柄','键盘','鼠标垫','路由器','保温杯','电子钟','刮胡刀','机顶盒',\
               '内衣','童装','童鞋','化妆品','洗护套装','女士鞋','洗漱用品','自行车','钓鱼配件','乐器','积木','娃娃','保健品','眼睛','邮票','笔','白酒','文具盒']
'''

#productName = ['童鞋','化妆品','洗护套装','洗漱用品','自行车','钓鱼配件','乐器','积木','娃娃','保健品','邮票','笔','白酒','文具盒','童装']

# Script entry point: start the interactive scrape. The call is not guarded by
# `if __name__ == "__main__":`, so it also runs when this module is imported.
collectInfo()

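# Comment API URL template (8835 is an example commentVersion value; <productId> and <page> are placeholders):
# https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv8835&productId=<productId>&score=1&sortType=5&page=<page>&pageSize=10&isShadowSku=0&fold=1
# Another URL noted for reference (not used by the code): https://club.jd.com/productpage/p-100000177760-s-0-t-0-p-1.html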

